library(knitr)

Loading the data

# Data source: http://bit.ly/CarreFourSalesDataset
# Read the sales data from the URL, treating empty strings as NA.
sales<- read.csv('http://bit.ly/CarreFourSalesDataset',na.strings = "")
# Printing the top six rows of the dataset
head(sales)
##        Date    Sales
## 1  1/5/2019 548.9715
## 2  3/8/2019  80.2200
## 3  3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5  2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing the datatypes of our data. Date is read as character
# (m/d/YYYY strings) and will need conversion before time-series work.
str(sales)
## 'data.frame':    1000 obs. of  2 variables:
##  $ Date : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num  549 80.2 340.5 489 634.4 ...
# Checking the size/shape of the data frame (rows, columns)

dim(sales)
## [1] 1000    2

Loading the libraries

library(anomalize)
## == Use anomalize to improve your Forecasts by 50%! =============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(timetk)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks tibbletime::filter(), stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()

Data Cleaning

# Checking for missing values.
# NOTE: is.null() only tests whether the object itself is NULL -- it can
# never detect NA entries inside an existing data frame, so the original
# check always returned FALSE regardless of missingness. Count NA values
# per column instead.
colSums(is.na(sales))

There are no missing values.

# Checking for duplicates: anyDuplicated() returns the row index of the
# first duplicated row, or 0 when every row is unique.
anyDuplicated(sales)
## [1] 0

There are no duplicated values.

# Checking for outliers in the Sales column.
# NOTE(review): the boxplot call below is commented out, so no outlier
# check actually runs in this report -- re-enable it to verify the
# outlier claim that follows.
#boxplot(sales)

A few outliers appear to be present in the Sales column; note, however, that the boxplot check above is commented out, so it should be re-enabled to confirm this.

Implementing the solution

# Changing the Date column to date type.
# The raw dates carry a 4-digit year ("1/5/2019"), so the format string
# must use %Y. The original "%m/%d/%y" parsed "2019" as the 2-digit
# year 20, shifting every date into 2020 -- visible in the tibble
# preview below, where rows dated 1/5/2019 etc. appear as 2020-01-05.
sales$Date <- as.Date(sales$Date, format = "%m/%d/%Y")
# Convert Date to POSIXct (date-time), as expected by timetk/anomalize.
sales$Date <- as.POSIXct(sales$Date)
# Converting the data frame to a tibble for the tidyverse/timetk workflow

df <- as_tibble(sales)
class(df)
## [1] "tbl_df"     "tbl"        "data.frame"
# Previewing our tibble.
# NOTE(review): the dates print as 2020 although the raw data is from
# 2019 -- this is a symptom of the "%m/%d/%y" format used in the date
# conversion above; it should be "%m/%d/%Y".
head(df)
## # A tibble: 6 x 2
##   Date                Sales
##   <dttm>              <dbl>
## 1 2020-01-05 03:00:00 549. 
## 2 2020-03-08 03:00:00  80.2
## 3 2020-03-03 03:00:00 341. 
## 4 2020-01-27 03:00:00 489. 
## 5 2020-02-08 03:00:00 634. 
## 6 2020-03-25 03:00:00 628.
# Checking for missing values in the tibble.
# is.null(df) only tests whether df itself is NULL and cannot detect NA
# entries; count them with is.na() instead.
sum(is.na(df))
# Drop any rows containing NA (a no-op when the count above is zero).
df <- na.omit(df)
# Using timetk to detect and visualize anomalies: the series is
# decomposed (trend + seasonality) and points whose remainder falls
# outside the expected band are flagged on the plot.
df %>% timetk::plot_anomaly_diagnostics(Date,Sales, .facet_ncol = 2)
## frequency = 11 observations per 1 hour
## trend = 20 observations per 12 hours
# To find the exact data points that are anomalies, we use the
# tk_anomaly_diagnostics() function and keep only rows flagged 'Yes'.

df <- df %>% timetk::tk_anomaly_diagnostics(Date,Sales) %>% filter(anomaly=='Yes')
## frequency = 11 observations per 1 hour
## trend = 20 observations per 12 hours
# An empty tibble (0 rows) here means no observation was flagged anomalous.
df
## # A tibble: 0 x 11
## # ... with 11 variables: Date <dttm>, observed <dbl>, season <dbl>,
## #   trend <dbl>, remainder <dbl>, seasadj <dbl>, remainder_l1 <dbl>,
## #   remainder_l2 <dbl>, anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>

The detected frequency is 11 observations per hour and the detected trend window is 20 observations per 12 hours. Since no data points were flagged, we conclude that there were no anomalies in the sales trend.

# Alternative approach: using the anomalize package to detect anomalies.
# Kept commented out -- the timetk diagnostics above perform the same
# check. (The stray extra "#" before time_recompose() has been removed so
# the pipeline is intact if this block is ever re-enabled.)
#df_anomalized <- df %>%
#    time_decompose(Sales, merge = TRUE) %>%
#    anomalize(remainder) %>%
#    time_recompose()
#df_anomalized %>% glimpse()